Modified website_agent to take an array of URLs

Maximilian Clarke 11 years ago
parent
commit
19c005fe45
2 changed files with 96 additions and 60 deletions
  1. 72 60
      app/models/agents/website_agent.rb
  2. 24 0
      spec/models/agents/website_agent_spec.rb

+ 72 - 60
app/models/agents/website_agent.rb

@@ -107,85 +107,97 @@ module Agents
107 107
       log "Fetching #{options['url']}"
108 108
       request_opts = { :followlocation => true }
109 109
       request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?
110
-      request = Typhoeus::Request.new(options['url'], request_opts)
111 110
 
112
-      request.on_failure do |response|
113
-        error "Failed: #{response.inspect}"
111
+      requests = []
112
+
113
+      if options['url'].kind_of?(Array)
114
+        options['url'].each do |url|
115
+           requests.push(Typhoeus::Request.new(url, request_opts))
116
+        end
117
+      else
118
+        requests.push(Typhoeus::Request.new(options['url'], request_opts))
114 119
       end
115 120
 
116
-      request.on_success do |response|
117
-        body = response.body
118
-        if (encoding = options['force_encoding']).present?
119
-          body = body.encode(Encoding::UTF_8, encoding)
121
+      requests.each do |request|
122
+        request.on_failure do |response|
123
+          error "Failed: #{response.inspect}"
120 124
         end
121
-        doc = parse(body)
122 125
 
123
-        if extract_full_json?
124
-          if store_payload!(previous_payloads(1), doc)
125
-            log "Storing new result for '#{name}': #{doc.inspect}"
126
-            create_event :payload => doc
126
+        request.on_success do |response|
127
+          body = response.body
128
+          if (encoding = options['force_encoding']).present?
129
+            body = body.encode(Encoding::UTF_8, encoding)
127 130
           end
128
-        else
129
-          output = {}
130
-          options['extract'].each do |name, extraction_details|
131
-            if extraction_type == "json"
132
-              result = Utils.values_at(doc, extraction_details['path'])
133
-              log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
134
-            else
135
-              case
136
-              when css = extraction_details['css']
137
-                nodes = doc.css(css)
138
-              when xpath = extraction_details['xpath']
139
-                nodes = doc.xpath(xpath)
131
+          doc = parse(body)
132
+
133
+          if extract_full_json?
134
+            if store_payload!(previous_payloads(1), doc)
135
+              log "Storing new result for '#{name}': #{doc.inspect}"
136
+              create_event :payload => doc
137
+            end
138
+          else
139
+            output = {}
140
+            options['extract'].each do |name, extraction_details|
141
+              if extraction_type == "json"
142
+                result = Utils.values_at(doc, extraction_details['path'])
143
+                log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
140 144
               else
141
-                error "'css' or 'xpath' is required for HTML or XML extraction"
142
-                return
143
-              end
144
-              unless Nokogiri::XML::NodeSet === nodes
145
-                error "The result of HTML/XML extraction was not a NodeSet"
146
-                return
147
-              end
148
-              result = nodes.map { |node|
149
-                if extraction_details['attr']
150
-                  node.attr(extraction_details['attr'])
151
-                elsif extraction_details['text']
152
-                  node.text()
145
+                case
146
+                when css = extraction_details['css']
147
+                  nodes = doc.css(css)
148
+                when xpath = extraction_details['xpath']
149
+                  nodes = doc.xpath(xpath)
153 150
                 else
154
-                  error "'attr' or 'text' is required on HTML or XML extraction patterns"
151
+                  error "'css' or 'xpath' is required for HTML or XML extraction"
152
+                  return
153
+                end
154
+                unless Nokogiri::XML::NodeSet === nodes
155
+                  error "The result of HTML/XML extraction was not a NodeSet"
155 156
                   return
156 157
                 end
157
-              }
158
-              log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
158
+                result = nodes.map { |node|
159
+                  if extraction_details['attr']
160
+                    node.attr(extraction_details['attr'])
161
+                  elsif extraction_details['text']
162
+                    node.text()
163
+                  else
164
+                    error "'attr' or 'text' is required on HTML or XML extraction patterns"
165
+                    return
166
+                  end
167
+                }
168
+                log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
169
+              end
170
+              output[name] = result
159 171
             end
160
-            output[name] = result
161
-          end
162 172
 
163
-          num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq
173
+            num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq
164 174
 
165
-          if num_unique_lengths.length != 1
166
-            error "Got an uneven number of matches for #{options['name']}: #{options['extract'].inspect}"
167
-            return
168
-          end
169
-      
170
-          old_events = previous_payloads num_unique_lengths.first
171
-          num_unique_lengths.first.times do |index|
172
-            result = {}
173
-            options['extract'].keys.each do |name|
174
-              result[name] = output[name][index]
175
-              if name.to_s == 'url'
176
-                result[name] = URI.join(options['url'], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
177
-              end
175
+            if num_unique_lengths.length != 1
176
+              error "Got an uneven number of matches for #{options['name']}: #{options['extract'].inspect}"
177
+              return
178 178
             end
179
+        
180
+            old_events = previous_payloads num_unique_lengths.first
181
+            num_unique_lengths.first.times do |index|
182
+              result = {}
183
+              options['extract'].keys.each do |name|
184
+                result[name] = output[name][index]
185
+                if name.to_s == 'url'
186
+                  result[name] = URI.join(options['url'], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
187
+                end
188
+              end
179 189
 
180
-            if store_payload!(old_events, result)
181
-              log "Storing new parsed result for '#{name}': #{result.inspect}"
182
-              create_event :payload => result
190
+              if store_payload!(old_events, result)
191
+                log "Storing new parsed result for '#{name}': #{result.inspect}"
192
+                create_event :payload => result
193
+              end
183 194
             end
184 195
           end
185 196
         end
197
+
198
+        hydra.queue request
199
+        hydra.run
186 200
       end
187
-      hydra.queue request
188
-      hydra.run
189 201
     end
190 202
 
191 203
     private

+ 24 - 0
spec/models/agents/website_agent_spec.rb

@@ -91,6 +91,30 @@ describe Agents::WebsiteAgent do
91 91
         @checker.check
92 92
         @checker.logs.first.message.should =~ /Got an uneven number of matches/
93 93
       end
94
+
95
+      it "should accept an array for url" do
96
+        @site['url'] = ["http://xkcd.com/1/", "http://xkcd.com/2/"]
97
+        @checker.options = @site
98
+        lambda { @checker.save! }.should_not raise_error;
99
+        lambda { @checker.check }.should_not raise_error;
100
+      end
101
+
102
+      it "should parse events from all urls in array" do
103
+        lambda {
104
+          @site['url'] = ["http://xkcd.com/", "http://xkcd.com/"]
105
+          @site['mode'] = 'all'
106
+          @checker.options = @site
107
+          @checker.check
108
+        }.should change { Event.count }.by(2)
109
+      end
110
+
111
+      it "should follow unique rules when parsing array of urls" do
112
+        lambda {
113
+          @site['url'] = ["http://xkcd.com/", "http://xkcd.com/"]
114
+          @checker.options = @site
115
+          @checker.check
116
+        }.should change { Event.count }.by(1)
117
+      end
94 118
     end
95 119
 
96 120
     describe 'encoding' do